This is the companion R Markdown document to the following presentations that were delivered in Winter 2015:

Adding the TIQ-TEST functions

## TIQ-TEST is not shipped as an R package, so we step into its checkout
## before sourcing it (presumably tiq-test.R resolves its own files relative
## to the working directory -- confirm against that script).
tiqtest.dir <- file.path("..", "tiq-test")
## setwd() returns the directory we just left; it is kept so the data path
## below can be anchored to this repo rather than to the tiq-test checkout.
current.dir <- setwd(tiqtest.dir)
source("tiq-test.R")

## Point the TIQ-TEST data root at this repo's "data" directory
.tiq.data.setRootPath(file.path(current.dir, "data"))
## INFO [2015-01-29 16:31:55 PST] pid=11961 tiq.data.setRootPath: Setting path to '/Users/alexcp/src/tiq-test-Winter2015/data'

Accessing the data using TIQ-TEST

We have roughly 2 months of data available on this public dataset:

# Print the dates returned for the "raw" category of the "public_outbound" group
print(tiq.data.getAvailableDates("raw", "public_outbound"))
##  [1] "20141001" "20141002" "20141003" "20141004" "20141005" "20141006"
##  [7] "20141007" "20141008" "20141009" "20141010" "20141011" "20141012"
## [13] "20141013" "20141014" "20141015" "20141016" "20141017" "20141018"
## [19] "20141019" "20141020" "20141021" "20141022" "20141023" "20141024"
## [25] "20141025" "20141026" "20141027" "20141028" "20141029" "20141030"
## [31] "20141031" "20141101" "20141102" "20141103" "20141104" "20141105"
## [37] "20141106" "20141107" "20141108" "20141109" "20141110" "20141111"
## [43] "20141112" "20141113" "20141114" "20141115" "20141116" "20141117"
## [49] "20141118" "20141119" "20141120" "20141121" "20141122" "20141123"
## [55] "20141124" "20141125" "20141126" "20141127" "20141128" "20141129"
## [61] "20141130"
# Print the dates returned for the "raw" category of the "public_inbound" group
print(tiq.data.getAvailableDates("raw", "public_inbound"))
##  [1] "20141001" "20141002" "20141003" "20141004" "20141005" "20141006"
##  [7] "20141007" "20141008" "20141009" "20141010" "20141011" "20141012"
## [13] "20141013" "20141014" "20141015" "20141016" "20141017" "20141018"
## [19] "20141019" "20141020" "20141021" "20141022" "20141023" "20141024"
## [25] "20141025" "20141026" "20141027" "20141028" "20141029" "20141030"
## [31] "20141031" "20141101" "20141102" "20141103" "20141104" "20141105"
## [37] "20141106" "20141107" "20141108" "20141109" "20141110" "20141111"
## [43] "20141112" "20141113" "20141114" "20141115" "20141116" "20141117"
## [49] "20141118" "20141119" "20141120" "20141121" "20141122" "20141123"
## [55] "20141124" "20141125" "20141126" "20141127" "20141128" "20141129"
## [61] "20141130"

This time, we also have a couple of private data feeds covering part of this period, but the information in them cannot be shared publicly as part of this release:

# The private feed is not part of the public release, so its dates are only
# printed when tiq.data.isDatasetAvailable() reports the dataset as present.
if (tiq.data.isDatasetAvailable("raw", "private1")) {
  print(tiq.data.getAvailableDates("raw", "private1"))
}
##  [1] "20141001" "20141002" "20141004" "20141005" "20141006" "20141007"
##  [7] "20141008" "20141009" "20141010" "20141011" "20141012" "20141013"
## [13] "20141014" "20141015" "20141016" "20141017" "20141018" "20141019"
## [19] "20141020" "20141021" "20141022" "20141023" "20141024" "20141025"
## [25] "20141026" "20141027" "20141028" "20141029" "20141030" "20141031"
## [31] "20141101" "20141102" "20141103" "20141104" "20141105" "20141106"
## [37] "20141107" "20141108" "20141109" "20141110" "20141111" "20141112"
## [43] "20141113" "20141114" "20141115" "20141116" "20141117" "20141118"
## [49] "20141119" "20141120" "20141121" "20141122" "20141123" "20141124"
## [55] "20141125" "20141126" "20141127" "20141128" "20141129" "20141130"

This is an example of “RAW” (not enriched) outbound data imported from combine output

# Load the raw public_outbound indicators for 2014-11-01
outbound.ti <- tiq.data.loadTI("raw", "public_outbound", "20141101")
# Display only the identifying columns of the loaded table
outbound.ti[
  ,
  list(entity, type, direction, source, date)
]
##                          entity type direction     source       date
##     1:             1.168.15.140 IPv4  outbound alienvault 2014-11-01
##     2:                1.93.6.86 IPv4  outbound alienvault 2014-11-01
##     3:             100.42.211.4 IPv4  outbound alienvault 2014-11-01
##     4:           101.227.172.24 IPv4  outbound alienvault 2014-11-01
##     5:             101.36.81.55 IPv4  outbound alienvault 2014-11-01
##    ---                                                              
## 11388:          up.frigo2000.it FQDN  outbound       zeus 2014-11-01
## 11389:          update.odeen.eu FQDN  outbound       zeus 2014-11-01
## 11390: update.rifugiopontese.it FQDN  outbound       zeus 2014-11-01
## 11391:       vahendkarasis4.com FQDN  outbound       zeus 2014-11-01
## 11392:           welcahllyn.com FQDN  outbound       zeus 2014-11-01

We can use the same loadTI function to also gather the enriched datasets:

# Same loader, pointed at the "enriched" category for the same group and date
enrich.ti <- tiq.data.loadTI("enriched", "public_outbound", "20141101")
# Drop the notes column before displaying the last rows
enrich.ti <- enrich.ti[, notes := NULL]
tail(enrich.ti)
##            entity type direction source       date asnumber
## 1:  94.102.63.153 IPv4  outbound   zeus 2014-11-01    29073
## 2:   94.103.36.55 IPv4  outbound   zeus 2014-11-01    47894
## 3:  95.163.121.12 IPv4  outbound   zeus 2014-11-01    12695
## 4: 98.131.185.136 IPv4  outbound   zeus 2014-11-01    32392
## 5: 98.131.185.136 IPv4  outbound   zeus 2014-11-01    32392
## 6:    99.181.5.83 IPv4  outbound   zeus 2014-11-01     7018
##                     asname country                       host
## 1:          Ecatel Network      NL                         NA
## 2: VeriTeknik Bilisim Ltd.      TR                         NA
## 3:   Digital Networks CJSC      RU                         NA
## 4:   Ecommerce Corporation      US                         NA
## 5:   Ecommerce Corporation      US projects.globaltronics.net
## 6:     AT&T Services, Inc.      US                         NA
##                                        rhost
## 1:                            exadomains.net
## 2:                 datacenter.veriteknik.com
## 3:                                        NA
## 4:                                        NA
## 5:                                        NA
## 6: adsl-99-181-5-83.dsl.irvnca.sbcglobal.net

This specific outbound dataset has the following sources included:

# Reload the raw outbound indicators and list the distinct feed names
outbound.ti <- tiq.data.loadTI("raw", "public_outbound", "20141101")
unique(outbound.ti[["source"]])
##  [1] "alienvault"        "feodo"             "malcode"          
##  [4] "malcode_zones"     "malwaredomainlist" "malwaredomains"   
##  [7] "malwaregroup"      "palevotracker"     "spyeye"           
## [10] "sslbl"             "zeus"

We can do the same for the inbound data we have to see the sources we have available:

# Load the raw inbound indicators and list the distinct feed names
inbound.ti <- tiq.data.loadTI("raw", "public_inbound", "20141101")
unique(inbound.ti[["source"]])
##  [1] "alienvault"        "autoshun"          "blocklistde"      
##  [4] "botscout"          "bruteforceblocker" "charleshaley"     
##  [7] "ciarmy"            "dragonresearch"    "dshield"          
## [10] "honeypot"          "openbl"            "packetmail"       
## [13] "virbl"

Here are some results of running the Novelty test on the inbound data:

# Novelty Test on the inbound group from 2014-10-01 to 2014-11-30,
# restricted to four of the available sources
inbound.novelty <- tiq.test.noveltyTest(
  "public_inbound", "20141001", "20141130",
  select.sources = c("alienvault", "blocklistde", "dshield", "charleshaley"),
  .progress = FALSE
)
tiq.test.plotNoveltyTest(inbound.novelty,
                         title = "Novelty Test - Inbound Indicators")

And results running on the outbound data:

# Novelty Test on the outbound group from 2014-10-01 to 2014-11-30,
# restricted to four of the available sources
outbound.novelty <- tiq.test.noveltyTest(
  "public_outbound", "20141001", "20141130",
  select.sources = c("alienvault", "malwaregroup", "malcode", "zeus"),
  .progress = FALSE
)
tiq.test.plotNoveltyTest(outbound.novelty,
                         title = "Novelty Test - Outbound Indicators")

Overlap Test

This is an example of applying the Overlap Test to our inbound dataset

  # Overlap Test on the enriched inbound data for 2014-11-01, all sources
  overlap <- tiq.test.overlapTest(
    "public_inbound", "20141101", "enriched",
    select.sources = NULL
  )
  # Build the plot object first, then render it explicitly
  overlap.plot <- tiq.test.plotOverlapTest(
    overlap,
    title = "Overlap Test - Inbound Data - 20141101"
  )
  print(overlap.plot)

Similarly, an example applying the Overlap Test to the outbound dataset

  # Overlap Test on the enriched outbound data for 2014-11-01, all sources
  overlap <- tiq.test.overlapTest(
    "public_outbound", "20141101", "enriched",
    select.sources = NULL
  )
  # Build the plot object first, then render it explicitly
  overlap.plot <- tiq.test.plotOverlapTest(
    overlap,
    title = "Overlap Test - Outbound Data - 20141101"
  )
  print(overlap.plot)

With the population data we can generate plots that compare, by country, the top counts of reported IP addresses on a specific date

  # Country populations of the indicators reported on 2014-11-11, one per
  # direction. split.ti is spelled FALSE rather than F: F is an ordinary
  # variable that can be reassigned, FALSE is a reserved word.
  outbound.pop <- tiq.test.extractPopulationFromTI("public_outbound", "country",
                                                   date = "20141111",
                                                   select.sources = NULL,
                                                   split.ti = FALSE)
  inbound.pop <- tiq.test.extractPopulationFromTI("public_inbound", "country",
                                                  date = "20141111",
                                                  select.sources = NULL,
                                                  split.ti = FALSE)

  # Reference population ("mmgeo", by country) to plot next to the two feeds.
  # NOTE(review): the log output below reports no dated files under
  # population/mmgeo, so this load may come back empty -- confirm the mmgeo
  # population data is in place before relying on the comparison.
  complete.pop <- tiq.data.loadPopulation("mmgeo", "country")
## Warning in max(tiq.data.getAvailableDates(category, group)): no
## non-missing arguments, returning NA
## WARN [2015-01-29 16:33:36 PST] pid=11961 tiq.data.loadTI: path '/Users/alexcp/src/tiq-test-Winter2015/data/population/mmgeo/NA.csv.gz' is invalid. No data available on date 'NA'.
  tiq.test.plotPopulationBars(
    c(inbound.pop, outbound.pop, complete.pop),
    "country"
  )

# Aging Test on the outbound group from 2014-10-01 to 2014-11-30
outbound.aging <- tiq.test.agingTest(
  "public_outbound",
  "20141001",
  "20141130"
)
tiq.test.plotAgingTest(outbound.aging)

# Aging Test on the inbound group from 2014-10-01 to 2014-11-30
inbound.aging <- tiq.test.agingTest(
  "public_inbound",
  "20141001",
  "20141130"
)
tiq.test.plotAgingTest(inbound.aging)

# Same Aging Test on the outbound group, this time with split.ti switched off.
# FALSE is used instead of F because F is a plain variable that can be
# reassigned, whereas FALSE is a reserved word.
outbound.aging <- tiq.test.agingTest("public_outbound", "20141001", "20141130",
                                     split.ti = FALSE)
tiq.test.plotAgingTest(outbound.aging)

# private1 is not part of the public release, so the Aging Test on it is only
# run when the dataset is present. The "enriched" category is checked because
# the log line below shows this test reading from data/enriched/private1.
if (tiq.data.isDatasetAvailable("enriched", "private1")) {
  # FALSE rather than F: F is an ordinary, reassignable variable
  private.aging <- tiq.test.agingTest("private1", "20141001", "20141130",
                                      split.ti = FALSE)
## WARN [2015-01-29 16:37:24 PST] pid=11961 tiq.data.loadTI: path '/Users/alexcp/src/tiq-test-Winter2015/data/enriched/private1/20141003.csv.gz' is invalid. No data available on date '20141003'.
  # Kept as the last expression so the plot call's value is still auto-printed
  tiq.test.plotAgingTest(private.aging, density.limit = 0.7)
}

  # Public side of the comparison; computed unconditionally, as before.
  # FALSE rather than F: F is an ordinary, reassignable variable.
  outbound.pop <- tiq.test.extractPopulationFromTI("public_outbound", "country",
                                                   date = "20141110",
                                                   select.sources = NULL,
                                                   split.ti = FALSE)

  # private1 is not part of the public release, so the private side and the
  # comparison plot are skipped when the dataset is absent.
  # NOTE(review): assumes the population is extracted from the "enriched"
  # category (country only shows up in the enriched table) -- confirm.
  if (tiq.data.isDatasetAvailable("enriched", "private1")) {
    private.pop <- tiq.test.extractPopulationFromTI("private1", "country",
                                                    date = "20141110",
                                                    select.sources = NULL,
                                                    split.ti = FALSE)

    # Kept as the last expression so the plot call's value is still auto-printed
    tiq.test.plotPopulationBars(c(private.pop, outbound.pop), "country",
                                title = "Comparing Private1 and Public Feeds on 20141110")
  }

# private1 is not part of the public release, so the Novelty Test on it is
# only run when the dataset is present. The "raw" category is checked because
# the log line below shows this test reading from data/raw/private1.
if (tiq.data.isDatasetAvailable("raw", "private1")) {
  # NOTE(review): the argument is spelled split.tii here but split.ti in the
  # aging/overlap/population calls -- confirm against tiq.test.noveltyTest's
  # signature. FALSE replaces F, which is an ordinary, reassignable variable.
  private.novelty <- tiq.test.noveltyTest("private1", "20141001", "20141130",
                                          split.tii = FALSE,
                                          .progress = FALSE)
## WARN [2015-01-29 16:37:32 PST] pid=11961 tiq.data.loadTI: path '/Users/alexcp/src/tiq-test-Winter2015/data/raw/private1/20141003.csv.gz' is invalid. No data available on date '20141003'.
  # Kept as the last expression so the plot call's value is still auto-printed
  tiq.test.plotNoveltyTest(private.novelty)
}
## Warning: Stacking not well defined when ymin != 0

# Novelty Test on the whole outbound group, without the per-source selection.
# NOTE(review): the argument is spelled split.tii here but split.ti in the
# aging/overlap/population calls -- confirm against tiq.test.noveltyTest's
# signature. FALSE replaces F, which is an ordinary, reassignable variable.
outbound.novelty <- tiq.test.noveltyTest("public_outbound", "20141001", "20141130",
                                         split.tii = FALSE,
                                         .progress = FALSE)
tiq.test.plotNoveltyTest(outbound.novelty)
## Warning: Stacking not well defined when ymin != 0

This is an example of applying the Overlap Test to our inbound dataset, followed by a comparison of the public outbound feeds against the private1 feed

  # Overlap Test on the enriched inbound data for 2014-11-01, all sources
  overlap <- tiq.test.overlapTest(
    "public_inbound", "20141101", "enriched",
    select.sources = NULL
  )
  # Build the plot object first, then render it explicitly
  overlap.plot <- tiq.test.plotOverlapTest(
    overlap,
    title = "Overlap Test - Inbound Data - 20141101"
  )
  print(overlap.plot)

  # private1 is not part of the public release, so the cross-group overlap is
  # only computed when its "enriched" data (the category requested below) is
  # present.
  if (tiq.data.isDatasetAvailable("enriched", "private1")) {
    # FALSE rather than F: F is an ordinary, reassignable variable
    overlap <- tiq.test.overlapTest(c("public_outbound", "private1"), "20141101", "enriched",
                                    split.ti = FALSE, select.sources = NULL)
    # Kept as the last expression so the plot call's value is still auto-printed
    tiq.test.plotOverlapTest(overlap, title = "OVERLAP - public_outbound VS private1 - 20141101")
  }